from cvxpy import * import numpy as np import time, collections, os, errno, sys import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt from Visualization_function import visualize from solveCrossTime import * from scipy import stats from sklearn import mixture from sklearn import covariance import sklearn, random from sklearn.cluster import KMeans import pandas as pd pd.set_option('display.max_columns', 500) np.set_printoptions(formatter={'float': lambda x: "{0:0.4f}".format(x)}) np.random.seed(102) #####################PARAMETERS TO PLAY WITH window_size = 10 maxIters = 1000 ##number of Iterations of the smoothening + clustering algo beta = 400 ## Beta parameter lambda_parameter = 11e-2 ## Lambda regularization parameter number_of_clusters = 11 threshold = 2e-5##Threshold for plots. Not used in TICC algorithm. write_out_file = False ##Only if True are any files outputted seg_len = 300##segment-length : used in confusion matrix computation ##INPUT file location input_file = "Synthetic Data Matrix rand_seed =[0,1] generated2.csv" ##Folder name to store all the OUPUTS prefix_string = "data_lambda=" + str(lambda_parameter)+"beta = "+str(beta) + "clusters=" +str(number_of_clusters)+"/" ######################################################## ##parameters that are automatically set based upoon above num_blocks = window_size + 1 switch_penalty = beta## smoothness penalty lam_sparse = lambda_parameter##sparsity parameter maxClusters = number_of_clusters+1## Number of clusters + 1 write_out_file = False ##Only if True are any files outputted num_stacked = num_blocks - 1 ##colors used in hexadecimal format hexadecimal_color_list = ["cc0000","0000ff","003300","33ff00","00ffcc","ffff00","ff9900","ff00ff","cccc66","666666","ffccff","660000","00ff00","ffffff","3399ff","006666","330000","ff0000","cc99ff","b0800f","3bd9eb","ef3e1b"] ##The basic folder to be created str_NULL = prefix_string print "lam_sparse", lam_sparse print "switch_penalty", switch_penalty print "num_cluster", maxClusters - 1 print "num stacked", num_stacked Data = np.loadtxt(input_file, delimiter= ",") print "completed getting the data" Data_pre = Data UNNORMALIZED_Data = Data*1000 (m,n) = Data.shape len_D_total = m size_blocks = n ##Add an optimization function # def optimize(): def upper2Full(a, eps = 0): ind = (a<eps)&(a>-eps) a[ind] = 0 n = int((-1 + np.sqrt(1+ 8*a.shape[0]))/2) A = np.zeros([n,n]) A[np.triu_indices(n)] = a temp = A.diagonal() A = np.asarray((A + A.T) - np.diag(temp)) return A def updateClusters(LLE_node_vals,switch_penalty = 1): """ Uses the Viterbi path dynamic programming algorithm to compute the optimal cluster assigments Takes in LLE_node_vals matrix and computes the path that minimizes the total cost over the path Note the LLE's are negative of the true LLE's actually!!!!! Note: switch penalty > 0 """ (T,num_clusters) = LLE_node_vals.shape future_cost_vals = np.zeros(LLE_node_vals.shape) ##compute future costs for i in xrange(T-2,-1,-1): j = i+1 indicator = np.zeros(num_clusters) future_costs = future_cost_vals[j,:] lle_vals = LLE_node_vals[j,:] for cluster in xrange(num_clusters): total_vals = future_costs + lle_vals + switch_penalty total_vals[cluster] -= switch_penalty future_cost_vals[i,cluster] = np.min(total_vals) ##compute the best path path = np.zeros(T) ##the first location curr_location = np.argmin(future_cost_vals[0,:] + LLE_node_vals[0,:]) path[0] = curr_location ##compute the path for i in xrange(T-1): j = i+1 future_costs = future_cost_vals[j,:] lle_vals = LLE_node_vals[j,:] total_vals = future_costs + lle_vals + switch_penalty total_vals[int(path[i])] -= switch_penalty path[i+1] = np.argmin(total_vals) ##return the computed path return path def find_matching(confusion_matrix): """ returns the perfect matching """ _,n = confusion_matrix.shape path = [] for i in xrange(n): max_val = -1e10 max_ind = -1 for j in xrange(n): if j in path: pass else: temp = confusion_matrix[i,j] if temp > max_val: max_val = temp max_ind = j path.append(max_ind) return path def computeF1Score_delete(num_cluster,matching_algo,actual_clusters,threshold_algo,save_matrix = False): """ computes the F1 scores and returns a list of values """ F1_score = np.zeros(num_cluster) for cluster in xrange(num_cluster): matched_cluster = matching_algo[cluster] true_matrix = actual_clusters[cluster] estimated_matrix = threshold_algo[matched_cluster] if save_matrix: np.savetxt("estimated_matrix_cluster=" + str(cluster)+".csv",estimated_matrix,delimiter = ",", fmt = "%1.4f") TP = 0 TN = 0 FP = 0 FN = 0 for i in xrange(num_stacked*n): for j in xrange(num_stacked*n): if estimated_matrix[i,j] == 1 and true_matrix[i,j] != 0: TP += 1.0 elif estimated_matrix[i,j] == 0 and true_matrix[i,j] == 0: TN += 1.0 elif estimated_matrix[i,j] == 1 and true_matrix[i,j] == 0: FP += 1.0 else: FN += 1.0 precision = (TP)/(TP + FP) recall = TP/(TP + FN) f1 = (2*precision*recall)/(precision + recall) F1_score[cluster] = f1 return F1_score def compute_confusion_matrix(num_clusters,clustered_points_algo, sorted_indices_algo): """ computes a confusion matrix and returns it """ seg_len = 200 true_confusion_matrix = np.zeros([num_clusters,num_clusters]) for point in xrange(len(clustered_points_algo)): cluster = int(clustered_points_algo[point]) ##CASE G: ABBACCCA # num = (int(sorted_indices_algo[point]/seg_len) ) # if num in [0,3,7]: # true_confusion_matrix[0,cluster] += 1 # elif num in[1,2]: # true_confusion_matrix[1,cluster] += 1 # else: # true_confusion_matrix[2,cluster] += 1 ##CASE F: ABCBA # num = (int(sorted_indices_algo[point]/seg_len)) # num = min(num, 4-num) # true_confusion_matrix[num,cluster] += 1 #CASE E : ABCABC num = (int(sorted_indices_algo[point]/seg_len) %num_clusters) true_confusion_matrix[num,cluster] += 1 ##CASE D : ABABABAB # num = (int(sorted_indices_algo[point]/seg_len) %2) # true_confusion_matrix[num,cluster] += 1 ##CASE C: # num = (sorted_indices_algo[point]/seg_len) # if num < 15: # true_confusion_matrix[0,cluster] += 1 # elif num < 20: # true_confusion_matrix[1,cluster] += 1 # else: # true_confusion_matrix[0,cluster] += 1 ##CASE B : # if num > 4: # num = 9 - num # true_confusion_matrix[num,cluster] += 1 ##CASE A : ABA # if sorted_indices_algo[point] < seg_len: # true_confusion_matrix[0,cluster] += 1 # elif sorted_indices_algo[point] <3*seg_len: # true_confusion_matrix[1,cluster] += 1 # else: # true_confusion_matrix[0,cluster] += 1 return true_confusion_matrix def computeF1_macro(confusion_matrix,matching, num_clusters): """ computes the macro F1 score confusion matrix : requres permutation matching according to which matrix must be permuted """ ##Permute the matrix columns permuted_confusion_matrix = np.zeros([num_clusters,num_clusters]) for cluster in xrange(num_clusters): matched_cluster = matching[cluster] permuted_confusion_matrix[:,cluster] = confusion_matrix[:,matched_cluster] ##Compute the F1 score for every cluster F1_score = 0 for cluster in xrange(num_clusters): TP = permuted_confusion_matrix[cluster,cluster] FP = np.sum(permuted_confusion_matrix[:,cluster]) - TP FN = np.sum(permuted_confusion_matrix[cluster,:]) - TP precision = TP/(TP + FP) recall = TP/(TP + FN) f1 = stats.hmean([precision,recall]) F1_score += f1 F1_score /= num_clusters return F1_score def computeNetworkAccuracy(matching,train_cluster_inverse, num_clusters): """ Takes in the matching for the clusters takes the computed clusters computes the average F1 score over the network """ threshold = 1e-2 f1 = 0 for cluster in xrange(num_clusters): true_cluster_cov = np.loadtxt("Inverse Covariance cluster ="+ str(cluster) +".csv", delimiter = ",") matched_cluster = matching[cluster] matched_cluster_cov = train_cluster_inverse[matched_cluster] (nrow,ncol) = true_cluster_cov.shape out_true = np.zeros([nrow,ncol]) for i in xrange(nrow): for j in xrange(ncol): if np.abs(true_cluster_cov[i,j]) > threshold: out_true[i,j] = 1 out_matched = np.zeros([nrow,ncol]) for i in xrange(nrow): for j in xrange(ncol): if np.abs(matched_cluster_cov[i,j]) > threshold: out_matched[i,j] = 1 np.savetxt("Network_true_cluster=" +str(cluster) + ".csv",true_cluster_cov, delimiter = ",") np.savetxt("Network_matched_cluster=" + str(matched_cluster)+".csv",matched_cluster_cov, delimiter = ",") ##compute the confusion matrix confusion_matrix = np.zeros([2,2]) for i in xrange(nrow): for j in xrange(ncol): confusion_matrix[out_true[i,j],out_matched[i,j]] += 1 f1 += computeF1_macro(confusion_matrix, [0,1],2) return f1/num_clusters ############ if not os.path.exists(os.path.dirname(str_NULL)): try: os.makedirs(os.path.dirname(str_NULL)) except OSError as exc: # Guard against race condition if exc.errno != errno.EEXIST: raise def hex_to_rgb(value): """ Return (red, green, blue) values Input is hexadecimal color code: #rrggbb. """ lv = len(value) out = tuple(int(value[i:i + lv // 3], 16) for i in range(0, lv, lv // 3)) out = tuple([x/256.0 for x in out]) return out color_list = [] for hex_color in hexadecimal_color_list: rgb_color = hex_to_rgb(hex_color) color_list.append(rgb_color) colors = color_list train_cluster_inverse = {} log_det_values = {} computed_covariance = {} cluster_mean_info = {} cluster_mean_stacked_info = {} old_clustered_points = np.zeros(10) for iters in xrange(maxIters): print "\n\n\nITERATION ###", iters num_clusters = maxClusters - 1 if iters == 0: ## Now splitting up stuff ## split1 : Training and Test ## split2 : Training and Test - different clusters training_percent = 0.90 training_idx = np.random.choice(m-num_blocks+1, size=int(m*training_percent),replace = False ) ##Ensure that the first and the last few points are in training_idx = list(training_idx) if 0 not in training_idx: training_idx.append(0) if m - num_stacked not in training_idx: training_idx.append(m-num_stacked) training_idx = np.array(training_idx) sorted_training_idx = sorted(training_idx) num_test_points = m - len(training_idx) test_idx = [] ##compute the test indices for point in xrange(m-num_stacked+1): if point not in sorted_training_idx: test_idx.append(point) sorted_test_idx = sorted(test_idx) # np.savetxt("sorted_training.csv", sorted_training_idx, delimiter = ",") # np.savetxt("sorted_test.csv", sorted_test_idx, delimiter = ",") ##Stack the complete data complete_Data = np.zeros([m - num_stacked + 1, num_stacked*n]) len_data = m for i in xrange(m - num_stacked + 1): idx = i for k in xrange(num_stacked): if i+k < len_data: idx_k = i + k complete_Data[i][k*n:(k+1)*n] = Data[idx_k][0:n] # np.savetxt("Complete_Data_stacked_rand_seed="+str(0)+".csv", complete_Data,delimiter=",") ##Stack the training data complete_D_train = np.zeros([len(training_idx), num_stacked*n]) len_training = len(training_idx) for i in xrange(len(sorted_training_idx)): idx = sorted_training_idx[i] for k in xrange(num_stacked): if i+k < len_training: idx_k = sorted_training_idx[i+k] complete_D_train[i][k*n:(k+1)*n] = Data[idx_k][0:n] # np.savetxt("Data_train_rand_seed="+str(0)+".csv", complete_D_train,delimiter=",") ##Stack the test complete_D_test = np.zeros([len(test_idx), num_stacked*n]) len_test = len(test_idx) for i in xrange(len(sorted_test_idx)): idx = sorted_test_idx[i] idx_left = idx -1 while idx_left not in sorted_training_idx: idx_left -= 1 point_tr = sorted_training_idx.index(idx_left) complete_D_test[i] = complete_D_train[point_tr] complete_D_test[i][0:n] = Data[idx][0:n] # np.savetxt("Data_test_rand_seed="+str(0)+".csv", complete_D_test,delimiter=",") #####INITIALIZATION!!! gmm = mixture.GaussianMixture(n_components=num_clusters, covariance_type="full") gmm.fit(complete_D_train) clustered_points = gmm.predict(complete_D_train) clustered_points_test = gmm.predict(complete_D_test) gmm_clustered_pts_test = gmm.predict(complete_D_test) gmm_clustered_pts = clustered_points + 0 gmm_covariances = gmm.covariances_ gmm_means = gmm.means_ ##USE K-means kmeans = KMeans(n_clusters = num_clusters,random_state = 0).fit(complete_D_train) clustered_points_kmeans = kmeans.labels_ clustered_points_test_kmeans = kmeans.predict(complete_D_test) true_confusion_matrix_g = compute_confusion_matrix(num_clusters,gmm_clustered_pts,sorted_training_idx) true_confusion_matrix_k = compute_confusion_matrix(num_clusters,clustered_points_kmeans,sorted_training_idx) ##Get the train and test points train_clusters = collections.defaultdict(list) test_clusters = collections.defaultdict(list) len_train_clusters = collections.defaultdict(int) len_test_clusters = collections.defaultdict(int) counter = 0 for point in range(len(clustered_points)): cluster = clustered_points[point] train_clusters[cluster].append(point) len_train_clusters[cluster] += 1 counter +=1 for point in range(len(clustered_points_test)): cluster = clustered_points_test[point] test_clusters[cluster].append(point) len_test_clusters[cluster] += 1 counter +=1 ##train_clusters holds the indices in complete_D_train ##for each of the clusters for cluster in xrange(num_clusters): if len_train_clusters[cluster] != 0: indices = train_clusters[cluster] indices_test = test_clusters[cluster] D_train = np.zeros([len_train_clusters[cluster],num_stacked*n]) for i in xrange(len_train_clusters[cluster]): point = indices[i] D_train[i,:] = complete_D_train[point,:] D_test = np.zeros([len_test_clusters[cluster], num_stacked*n]) for i in xrange(len_test_clusters[cluster]): point = indices_test[i] D_test[i,:] = complete_D_test[point,:] print "stacking Cluster #", cluster,"DONE!!!" ##Fit a model - OPTIMIZATION size_blocks = n probSize = num_stacked * size_blocks lamb = np.zeros((probSize,probSize)) + lam_sparse S = np.cov(np.transpose(D_train) ) print "starting the OPTIMIZATION" #Set up the Toeplitz graphical lasso problem gvx = TGraphVX() theta = semidefinite(probSize,name='theta') obj = -log_det(theta) + trace(S*theta) gvx.AddNode(0, obj) gvx.AddNode(1) dummy = Variable(1) gvx.AddEdge(0,1, Objective = lamb*dummy + num_stacked*dummy + size_blocks*dummy) ##solve using customized ADMM solver gvx.Solve(Verbose=False, MaxIters=1000, Rho = 1, EpsAbs = 1e-6, EpsRel = 1e-6) #THIS IS THE SOLUTION val = gvx.GetNodeValue(0,'theta') S_est = upper2Full(val, 0) X2 = S_est u, _ = np.linalg.eig(S_est) cov_out = np.linalg.inv(X2) inv_matrix = cov_out ##Store the log-det, covariance, inverse-covariance, cluster means, stacked means log_det_values[num_clusters, cluster] = np.log(np.linalg.det(cov_out)) computed_covariance[num_clusters,cluster] = cov_out cluster_mean_info[num_clusters,cluster] = np.mean(D_train, axis = 0)[(num_stacked-1)*n:num_stacked*n].reshape([1,n]) cluster_mean_stacked_info[num_clusters,cluster] = np.mean(D_train,axis=0) train_cluster_inverse[cluster] = X2 cluster_norms = list(np.zeros(num_clusters)) # for cluster in xrange(num_clusters): # print "length of the cluster ", cluster,"------>", len_train_clusters[cluster] ##Computing the norms if iters != 0: for cluster in xrange(num_clusters): cluster_norms[cluster] = (np.linalg.norm(old_computed_covariance[num_clusters,cluster]),cluster) sorted_cluster_norms = sorted(cluster_norms,reverse = True) ##Add a point to the empty clusters ##Assumption more non empty clusters than empty ones counter = 0 for cluster in xrange(num_clusters): if len_train_clusters[cluster] == 0: ##Add a point to the cluster while len_train_clusters[sorted_cluster_norms[counter][1]] == 0: # print "counter is:", counter counter += 1 counter = counter % num_clusters # print "counter is:", counter cluster_selected = sorted_cluster_norms[counter][1] # print "cluster that is zero is:", cluster, "selected cluster instead is:", cluster_selected break_flag = False while not break_flag: point_num = random.randint(0,len(clustered_points)) if clustered_points[point_num] == cluster_selected: clustered_points[point_num] = cluster computed_covariance[num_clusters,cluster] = old_computed_covariance[num_clusters,cluster_selected] cluster_mean_stacked_info[num_clusters,cluster] = complete_D_train[point_num,:] cluster_mean_info[num_clusters,cluster] = complete_D_train[point,:][(num_stacked-1)*n:num_stacked*n] break_flag = True counter += 1 old_train_clusters = train_clusters old_computed_covariance = computed_covariance ##Code -----------------------SMOOTHENING ##For each point compute the LLE print "beginning with the DP - smoothening ALGORITHM" LLE_all_points_clusters = np.zeros([len(clustered_points),num_clusters]) for point in xrange(len(clustered_points)): # print "Point #", point if point + num_stacked-1 < complete_D_train.shape[0]: for cluster in xrange(num_clusters): # print "\nCLuster#", cluster cluster_mean = cluster_mean_info[num_clusters,cluster] cluster_mean_stacked = cluster_mean_stacked_info[num_clusters,cluster] x = complete_D_train[point,:] - cluster_mean_stacked[0:(num_blocks-1)*n] cov_matrix = computed_covariance[num_clusters,cluster][0:(num_blocks-1)*n,0:(num_blocks-1)*n] inv_cov_matrix = np.linalg.inv(cov_matrix) log_det_cov = np.log(np.linalg.det(cov_matrix))# log(det(sigma2|1)) lle = np.dot( x.reshape([1,(num_blocks-1)*n]), np.dot(inv_cov_matrix,x.reshape([n*(num_blocks-1),1])) ) + log_det_cov LLE_all_points_clusters[point,cluster] = lle ##Update cluster points - using dynamic programming smoothening clustered_points = updateClusters(LLE_all_points_clusters,switch_penalty = switch_penalty) print "\ncompleted smoothening algorithm" print "\n\nprinting the length of points in each cluster" for cluster in xrange(num_clusters): print "length of cluster #", cluster, "-------->", sum([x== cluster for x in clustered_points]) true_confusion_matrix = np.zeros([num_clusters,num_clusters]) ##Save a figure of segmentation # print "length of clustered_points", len(clustered_points) # print "length of sorted training_idx", len(sorted_training_idx) plt.figure() plt.plot(sorted_training_idx[0:len(clustered_points)],clustered_points,color = "r")#,marker = ".",s =100) plt.ylim((-0.5,num_clusters + 0.5)) # plt.savefig("TRAINING_EM_lam_sparse="+str(lam_sparse) + "switch_penalty = " + str(switch_penalty) + ".jpg") plt.close("all") plt.figure() plt.plot(sorted_training_idx[0:len(gmm_clustered_pts)],gmm_clustered_pts,color = "r")#,marker=".",s=100) plt.ylim((-0.5,num_clusters + 0.5)) # plt.savefig("TRAINING_GMM_lam_sparse="+str(lam_sparse) + "switch_penalty =" + str(switch_penalty)+ ".jpg") plt.close("all") # print "Done writing the figure" # print "Done writing the figure" true_confusion_matrix = compute_confusion_matrix(num_clusters,clustered_points,sorted_training_idx) # print "TRAINING TRUE confusion MATRIX:\n", true_confusion_matrix ####TEST SETS STUFF ### The closest point in training set is the cluster ### LLE + swtiching_penalty clustered_test = np.zeros(len(clustered_points_test)) for point in xrange(len(clustered_points_test)): idx = sorted_test_idx[point] ##Get the 2 closest points from training idx1 = idx + 1 while (idx1 not in sorted_training_idx and idx1 < m): idx1 += 1 idx2 = idx -1 while (idx2 not in sorted_training_idx and idx2 > -1): idx2 -= 1 if idx1 == m or idx2 == -1: print "idx1 :", idx1 == m print "idx2 :", idx2 == -1 print "point is:", point print "idx of the point is:", idx print "control should NOT reach here!!!!!!!" break vals = np.zeros(num_clusters) right_clust = clustered_points[sorted_training_idx.index(idx1)] left_clust = clustered_points[sorted_training_idx.index(idx2)] point_tr = sorted_training_idx.index(idx2) data_tt = complete_D_train[point_tr,:] data_tt[0:n] = Data[idx,:]#complete_D_test[point,0:num_stacked] for cluster in xrange(num_clusters): cluster_mean = cluster_mean_info[num_clusters,cluster] cluster_mean_stacked = cluster_mean_stacked_info[num_clusters,cluster] x = data_tt - cluster_mean_stacked[0:(num_blocks-1)*n] cov_matrix = computed_covariance[num_clusters,cluster][0:(num_blocks-1)*n,0:(num_blocks-1)*n] inv_cov_matrix = np.linalg.inv(cov_matrix) log_det_cov = np.log(np.linalg.det(cov_matrix))# log(det(sigma2|1)) lle = np.dot( x.reshape([1,(num_blocks-1)*n]), np.dot(inv_cov_matrix,x.reshape([n*(num_blocks-1),1])) ) + log_det_cov vals[cluster] = lle + switch_penalty*(cluster !=left_clust ) + switch_penalty*(cluster != right_clust ) out = np.argmin(vals) clustered_test[point] = out plt.figure() plt.plot(sorted_test_idx[0:len(clustered_test)],clustered_test,color = "r")#,marker = ".", s= 100) plt.ylim((-0.5,num_clusters + 0.5)) # plt.savefig("TEST_EM_lam_sparse="+str(lam_sparse) +"switch_penalty="+str(switch_penalty)+ ".jpg") plt.close("all") # print "done writing" ##GMM - TEST PREDICTIONS clustered_test_gmm = np.zeros(len(clustered_points_test)) for point in xrange(len(clustered_points_test)): idx = sorted_test_idx[point] ##Get the 2 closest points from training idx1 = idx + 1 while (idx1 not in sorted_training_idx and idx1 < m): idx1 += 1 idx2 = idx -1 while (idx2 not in sorted_training_idx and idx2 > -1): idx2 -= 1 if idx1 == m or idx2 == -1: print "idx1 :", idx1 == m print "idx2 :", idx2 == -1 print "point is:", point print "idx of the point is:", idx print "SOMETHING WRONG!!!!!!!" break vals = np.zeros(num_clusters) right_clust = gmm_clustered_pts[sorted_training_idx.index(idx1)] left_clust = gmm_clustered_pts[sorted_training_idx.index(idx2)] point_tr = sorted_training_idx.index(idx2) data_tt = complete_D_train[point_tr,:] data_tt[0:n] = Data[idx,:]#complete_D_test[point,0:num_stacked] for cluster in xrange(num_clusters): cluster_mean_stacked = gmm_means[cluster] x = data_tt - cluster_mean_stacked[0:(num_blocks-1)*n] cov_matrix = gmm_covariances[cluster][0:(num_blocks-1)*n,0:(num_blocks-1)*n] inv_cov_matrix = np.linalg.inv(cov_matrix) log_det_cov = np.log(np.linalg.det(cov_matrix)) lle = np.dot( x.reshape([1,(num_blocks-1)*n]), np.dot(inv_cov_matrix,x.reshape([n*(num_blocks-1),1])) ) + log_det_cov vals[cluster] = lle #+ switch_penalty*(cluster !=left_clust ) + switch_penalty*(cluster != right_clust ) out = np.argmin(vals) clustered_test_gmm[point] = out plt.figure() plt.plot(sorted_test_idx[0:len(clustered_test_gmm)],clustered_test_gmm,color = "r")#,marker = ".", s= 100) plt.ylim((-0.5,num_clusters + 0.5)) # plt.savefig("TEST_GMM_lam_sparse="+str(lam_sparse) + ".jpg") plt.close("all") # print "done writing" plt.figure() plt.plot(sorted_test_idx[0:len(clustered_points_test_kmeans)],clustered_points_test_kmeans,color = "r")#,marker = ".", s= 100) plt.ylim((-0.5,num_clusters + 0.5)) # plt.savefig("TEST_Modified_KMEANS_NEW_lam_sparse="+str(lam_sparse) + ".jpg") plt.close("all") # print "done writing" ##Segment length seg_len = 50 true_confusion_matrix_EM = compute_confusion_matrix(num_clusters,clustered_test,sorted_test_idx) true_confusion_matrix_GMM = compute_confusion_matrix(num_clusters,gmm_clustered_pts_test,sorted_test_idx) true_confusion_matrix_kmeans = compute_confusion_matrix(num_clusters,clustered_points_test_kmeans,sorted_test_idx) true_answers = np.zeros(len(clustered_points)) for point in xrange(len(clustered_points)): num = int(sorted_training_idx[point]/25.0) if num <10 : cluster = 0 elif num < 20: cluster = 1 else: cluster = 0 true_answers[point] = cluster # print "length of sorted_test_idx", len(sorted_test_idx) # print "length of true answers", len(true_answers) # print "new length", len(sorted_training_idx[0:len(clustered_points)]) plt.figure() plt.plot(sorted_training_idx[0:len(clustered_points)],true_answers,color = "k") plt.ylim((-0.5,num_clusters + 0.5)) # plt.savefig("True Output modifed lam_sparse=" + str(lam_sparse)+ ".jpg") plt.close("all") binary_EM = (true_confusion_matrix_EM[0,0] + true_confusion_matrix_EM[1,1])/len(clustered_points_test) binary_EM = np.max([binary_EM,1 -binary_EM]) # print "EM is -------->",binary_EM binary_GMM = (true_confusion_matrix_GMM[0,0] + true_confusion_matrix_GMM[1,1])/len(clustered_points_test) binary_GMM = np.max([binary_GMM,1-binary_GMM]) binary_Kmeans = (true_confusion_matrix_kmeans[0,0] + true_confusion_matrix_kmeans[1,1])/len(clustered_points_test) binary_Kmeans = np.max([binary_Kmeans,1-binary_Kmeans]) # print "TEST EM TRUE confusion MATRIX:\n", true_confusion_matrix_EM # print "TEST GMM TRUE confusion MATRIX:\n", true_confusion_matrix_GMM # print "TEST KMEANS TRUE CONFUSION MATRIX:\n", true_confusion_matrix_kmeans ##Create the F1 score from the graphs from k-means and GMM ##Get the train and test points train_inverse_covariance_kmeans = {} train_inverse_covariance_gmm = {} counter = 0 for cluster in xrange(num_clusters): ##GMM out = [(x == cluster) for x in gmm_clustered_pts] len_cluster = sum(out) D_train = np.zeros([len_cluster,num_stacked*n]) counter = 0 for point in xrange(len(gmm_clustered_pts)): if gmm_clustered_pts[point] == cluster: D_train[counter,:] = complete_D_train[point,:] counter += 1 train_inverse_covariance_gmm[cluster] = np.linalg.inv(gmm_covariances[cluster]) ##Kmeans out = [(x == cluster) for x in clustered_points_kmeans] len_cluster = sum(out) D_train = np.zeros([len_cluster,num_stacked*n]) counter2 = 0 for point in xrange(len(clustered_points_kmeans)): if clustered_points_kmeans[point] == cluster: D_train[counter2,:] = complete_D_train[point,:] counter2 += 1 train_inverse_covariance_kmeans[cluster] = X2 ##Using inverses from kmeans, GMM and EM threshold = 1e-5 ##Thresholding threshold_kmeans = {} threshold_GMM = {} threshold_EM = {} threshold_actual = {} ##Change this to add thresholding function ##Kmeans - thresholding for cluster in xrange(num_clusters): out = np.zeros(train_inverse_covariance_kmeans[0].shape, dtype = np.int) A = train_inverse_covariance_kmeans[cluster] for i in xrange(out.shape[0]): for j in xrange(out.shape[1]): if (np.abs(A[i,j]) > threshold): out[i,j] = 1 threshold_kmeans[cluster] = out ##GMM - thresholding for cluster in xrange(num_clusters): out = np.zeros(train_inverse_covariance_gmm[0].shape, dtype = np.int) A = train_inverse_covariance_gmm[cluster] for i in xrange(out.shape[0]): for j in xrange(out.shape[1]): if np.abs(A[i,j]) > threshold: out[i,j] = 1 threshold_GMM[cluster] = out ## EM - thresholding for cluster in xrange(num_clusters): out = np.zeros(train_inverse_covariance_gmm[0].shape, dtype = np.int) A = train_cluster_inverse[cluster] for i in xrange(out.shape[0]): for j in xrange(out.shape[1]): if np.abs(A[i,j]) > threshold: out[i,j] = 1 threshold_EM[cluster] = out ##compute the matching ##Assume its a 2x2 matrix? actual_clusters = {} for cluster in xrange(num_clusters): actual_clusters[cluster] = np.loadtxt("Inverse Covariance cluster =" + str(cluster)+".csv", delimiter = ",") ##compute the appropriate matching matching_Kmeans = find_matching(true_confusion_matrix_kmeans) matching_GMM = find_matching(true_confusion_matrix_GMM) matching_EM = find_matching(true_confusion_matrix_EM) correct_EM = 0 correct_GMM = 0 correct_KMeans = 0 for cluster in xrange(num_clusters): matched_cluster_EM = matching_EM[cluster] matched_cluster_GMM = matching_GMM[cluster] matched_cluster_Kmeans = matching_Kmeans[cluster] correct_EM += true_confusion_matrix_EM[cluster,matched_cluster_EM] correct_GMM += true_confusion_matrix_GMM[cluster,matched_cluster_GMM] correct_KMeans += true_confusion_matrix_kmeans[cluster, matched_cluster_Kmeans] # np.savetxt("computed estimated_matrix cluster =" + str(cluster) + ".csv", train_cluster_inverse[matched_cluster] , delimiter = ",", fmt = "%1.6f") binary_EM = correct_EM/len(clustered_points_test) binary_GMM = correct_GMM/len(clustered_points_test) binary_Kmeans = correct_KMeans/len(clustered_points_test) # print "\n\nKMEANS" # print "true confusion_matrix", true_confusion_matrix_kmeans # print "matching", matching_Kmeans # print "\n\nGMM" # print "true_confusion_matrix_GMM", true_confusion_matrix_GMM # print "matching GMM", matching_GMM # print "\n\nEM" # print "true confusion_matrix", true_confusion_matrix_EM # print "matching ", matching_EM print "\n\n\n" # print "The F1 scores is:", F1_EM,F1_GMM, F1_Kmeans # print "The binary accuracy", binary_EM, binary_GMM, binary_Kmeans if np.array_equal(old_clustered_points,clustered_points): print "\n\n\n\nCONVERGED!!! BREAKING EARLY!!!" break old_clustered_points = clustered_points ##Training confusion matrix train_confusion_matrix_EM = compute_confusion_matrix(num_clusters,clustered_points,sorted_training_idx) train_confusion_matrix_GMM = compute_confusion_matrix(num_clusters,gmm_clustered_pts,sorted_training_idx) train_confusion_matrix_kmeans = compute_confusion_matrix(num_clusters,clustered_points_kmeans,sorted_training_idx) test_confusion_matrix_EM = compute_confusion_matrix(num_clusters,clustered_test,sorted_test_idx) out = computeNetworkAccuracy(matching_EM, train_cluster_inverse,num_clusters) print "the network accuracy F1 score is:", out f1_EM_tr = computeF1_macro(train_confusion_matrix_EM,matching_EM,num_clusters) f1_GMM_tr = computeF1_macro(train_confusion_matrix_GMM,matching_GMM,num_clusters) f1_kmeans_tr = computeF1_macro(train_confusion_matrix_kmeans,matching_Kmeans,num_clusters) f1_EM_test = computeF1_macro(test_confusion_matrix_EM,matching_EM,num_clusters) # print "The TEST binary accuracy", binary_EM, binary_GMM, binary_Kmeans # print "\n\n" # print "TRAINING F1 score:", f1_EM_tr, f1_GMM_tr, f1_kmeans_tr # print "TEST F1 score:", f1_EM_test correct_EM = 0 correct_GMM = 0 correct_KMeans = 0 for cluster in xrange(num_clusters): matched_cluster_EM = matching_EM[cluster] matched_cluster_GMM = matching_GMM[cluster] matched_cluster_Kmeans = matching_Kmeans[cluster] correct_EM += train_confusion_matrix_EM[cluster,matched_cluster_EM] correct_GMM += train_confusion_matrix_GMM[cluster,matched_cluster_GMM] correct_KMeans += train_confusion_matrix_kmeans[cluster, matched_cluster_Kmeans] binary_EM = correct_EM/len(training_idx) binary_GMM = correct_GMM/len(training_idx) binary_Kmeans = correct_KMeans/len(training_idx) print "\n\n" print "\n\n\n" # print "The TRAINING binary accuracy", binary_EM, binary_GMM, binary_Kmeans # print "lam_sparse", lam_sparse # print "switch_penalty", switch_penalty # print "num_cluster", maxClusters - 1 # print "num stacked", num_stacked